Project: run1-v0.0.28.
This report is meant to help explore the results of the derfinder (Collado-Torres et al. 2013) package and was generated using the derfinderReport (Collado-Torres et al. 2013b) package. While the report is rich, it is meant to just start the exploration of the results and exemplify some of the code used to do so. You will most likely need a more in-depth analysis for your specific data set.
Most plots were made with using ggplot2 (Wickham, 2009).
#### Libraries needed
## Bioconductor
library("IRanges")
library("GenomicRanges")
if(hg19) {
library("biovizBase")
library("TxDb.Hsapiens.UCSC.hg19.knownGene")
}
## CRAN
library("ggplot2")
library("gridExtra")
library("data.table")
library("knitcitations")
library("xtable")
library("RColorBrewer")
## GitHub
library("derfinder")
library("knitrBootstrap")
library("rCharts")
#### Code setup
## For ggplot
tmp <- fullRegions
names(tmp) <- seq_len(length(tmp))
regions.df <- as.data.frame(tmp)
regions.df$width <- width(tmp)
rm(tmp)
nulls.df <- as.data.frame(fullNullSummary)
## Get chr lengths
if(hg19) {
data(hg19Ideogram, package = "biovizBase")
seqlengths(fullRegions) <- seqlengths(hg19Ideogram)[names(seqlengths(fullRegions))]
}
## Find which chrs are present in the data set
chrs <- levels(seqnames(fullRegions))
names(chrs) <- gsub("chr", "", chrs)
## Subset the fullCoverage data in case that a subset was used
colsubset <- optionsStats$colsubset
if(!is.null(fullCov) & !is.null(colsubset)) {
fullCov <- lapply(fullCov, function(x) { x[, colsubset] })
}
## Get region coverage for the top regions
if(nBestRegions > 0) {
regionCoverage <- getRegionCoverage(fullCov=fullCov, regions=fullRegions[seq_len(nBestRegions)], calculateMeans=FALSE, verbose=FALSE)
save(regionCoverage, file=file.path(workingDir, "regionCoverage.Rdata"))
}
Theoretically, the p-values should be uniformly distributed between 0 and 1.
p1 <- ggplot(regions.df, aes(x=pvalues, colour=seqnames)) + geom_line(stat="density") + xlim(0, 1) + labs(title="Density of p-values") + xlab("p-values") + scale_colour_discrete(limits=chrs) + theme(legend.title=element_blank())
p1
## Compare the pvalues summary(fullRegions$pvalues)
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## 0.0012 0.0264 0.1870 0.3050 0.5450 1.0000
This is the numerical summary of the distribution of the p-values.
summary(fullRegions$qvalues)
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## 0.0049 0.0618 0.2210 0.2460 0.4300 0.5920
This is the numerical summary of the distribution of the q-values.
qtable <- lapply(c(1e-04, 0.001, 0.01, 0.025, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1), function(x) {
data.frame("Cut" = x, "Count" = sum(fullRegions$qvalues <= x))
})
qtable <- do.call(rbind, qtable)
print(xtable(qtable, digits=4, align=rep("c", 3)), type="html", include.rownames=FALSE)
| Cut | Count |
|---|---|
| 0.0001 | 0 |
| 0.0010 | 0 |
| 0.0100 | 3333 |
| 0.0250 | 4881 |
| 0.0500 | 5788 |
| 0.1000 | 8496 |
| 0.2000 | 11622 |
| 0.3000 | 14791 |
| 0.4000 | 16930 |
| 0.5000 | 19613 |
| 0.6000 | 24031 |
| 0.7000 | 24031 |
| 0.8000 | 24031 |
| 0.9000 | 24031 |
| 1.0000 | 24031 |
This table shows the number of candidate Differentially Expressed Regions (DERs) with q-value less or equal than some commonly used cutoff values.
xrange <- range(log10(regions.df$width))
p2a <- ggplot(regions.df, aes(x=log10(width), colour=seqnames)) + geom_line(stat="density") + labs(title="Density of region lengths") + xlab("Region width (log10)") + scale_colour_discrete(limits=chrs) + xlim(xrange) + theme(legend.title=element_blank())
p2b <- ggplot(regions.df[idx.sig, ], aes(x=log10(width), colour=seqnames)) + geom_line(stat="density") + labs(title="Density of region lengths (significant only)") + xlab("Region width (log10)") + scale_colour_discrete(limits=chrs) + xlim(xrange) + theme(legend.title=element_blank())
grid.arrange(p2a, p2b)
This plot shows the density of the region lengths for all regions. The bottom panel is restricted to significant regions (q-value < 0.1)
xrange <- range(log10(regions.df$area))
p3a <- ggplot(regions.df, aes(x=log10(area), colour=seqnames)) + geom_line(stat="density") + labs(title="Density of region areas") + xlab("Region area (log10)") + scale_colour_discrete(limits=chrs) + xlim(xrange) + theme(legend.title=element_blank())
p3b <- ggplot(regions.df[idx.sig, ], aes(x=log10(area), colour=seqnames)) + geom_line(stat="density") + labs(title="Density of region areas (significant only)") + xlab("Region area (log10)") + scale_colour_discrete(limits=chrs) + xlim(xrange) + theme(legend.title=element_blank())
grid.arrange(p3a, p3b)
This plot shows the density of the region areas for all regions. The bottom panel is restricted to significant regions (q-value < 0.1)
p4 <- ggplot(nulls.df, aes(x=log10(width), colour=chr)) + geom_line(stat="density") + labs(title="Density of null region lengths") + xlab("Region width (log10)") + scale_colour_discrete(limits=chrs) + theme(legend.title=element_blank())
p5 <- ggplot(nulls.df, aes(x=log10(area), colour=chr)) + geom_line(stat="density") + labs(title="Density of null region areas") + xlab("Region area (log10)") + scale_colour_discrete(limits=chrs) + theme(legend.title=element_blank())
grid.arrange(p4, p5)
This plot shows the density of the null region lengths and areas. There were a total of 870 null regions.
xrange <- range(log2(regions.df$meanCoverage))
p6a <- ggplot(regions.df, aes(x=log2(meanCoverage), colour=seqnames)) + geom_line(stat="density") + labs(title="Density of region mean coverage") + xlab("Region mean coverage (log2)") + scale_colour_discrete(limits=chrs) + xlim(xrange) + theme(legend.title=element_blank())
p6b <- ggplot(regions.df[idx.sig, ], aes(x=log2(meanCoverage), colour=seqnames)) + geom_line(stat="density") + labs(title="Density of region mean coverage (significant only)") + xlab("Region mean coverage (log2)") + scale_colour_discrete(limits=chrs) + xlim(xrange) + theme(legend.title=element_blank())
grid.arrange(p6a, p6b)
This plot shows the density of the region mean coverage for all regions. The bottom panel is restricted to significant regions (q-value < 0.1)
The following plots are MA-style plots comparing each group vs the first one. The mean coverage is calculated using only two groups at a time and is weighted according to the number of samples on each group. Note that the mean coverage and fold change as calculated here do not taking into account the library sizes.
These plots are only shown when there are two or more groups. A total of 1 plot(s) were made.
for(j in grep("foldChange", colnames(values(fullRegions)))) {
## Identify the groups
groups <- strsplit(gsub("foldChange", "", colnames(values(fullRegions))[j]), "vs")[[1]]
## Calculate the mean coverage only using the 2 groups in question
j.mean <- which(colnames(values(fullRegions)) %in% paste0("mean", groups))
groups.n <- sapply(groups, function(x) { sum(optionsStats$groupInfo == x) })
ma.mean.mat <- as.matrix(values(fullRegions)[, j.mean])
## Weighted means
ma.mean <- drop(ma.mean.mat %*% groups.n) / sum(groups.n)
ma <- data.frame(mean=ma.mean, foldChange=values(fullRegions)[, j])
ma2 <- ma[is.finite(ma$foldChange), ]
fold.mean <- data.frame(foldMean=mean(ma2$foldChange, na.rm=TRUE))
p.ma <- ggplot(ma, aes(x=log2(mean), y=foldChange)) + geom_point(size=1.5, alpha=1/5) + ylab("Fold Change (log2)\nRed dashed line at mean; blue line is loess fit") + xlab(paste("Mean coverage (log2) using only groups", groups[1], "and", groups[2])) + labs(title=paste("MA style plot:", groups[1], "vs ", groups[2])) + geom_hline(aes(yintercept=foldMean), data=fold.mean, colour="#990000", linetype="dashed") + geom_smooth(aes(y=foldChange, x=log2(mean)), data=subset(ma2, mean > 0), method=loess)
print(p.ma)
}
The following plots were made using ggbio (Yin et al. 2012) which in turn uses ggplot2 (Wickham, 2009). For more details check plotOverview in derfinder (Collado-Torres et al. 2013).
plotOverview(regions=fullRegions, type="qval", base_size=30, areaRel=10, legend.position=c(0.97, 0.12))
This plot shows the genomic locations of the candidate regions found in the analysis. The significant regions (q-value less than 0.1) are highlighted and the area of the regions is shown on top of each chromosome. Note that the area is in a relative scale.
plotOverview(regions=fullRegions, annotation=fullRegions, type="annotation", base_size=30, areaRel=10, legend.position=c(0.97, 0.12))
This genomic overview plot shows the annotation region type for the candidate regions. Note that the regions are shown only if the annotation information is available. Below is a table of the actual number of results per annotation region type.
annoReg <- table(fullRegions$region, useNA="always")
annoReg.df <- data.frame(Region=names(annoReg), Count=as.vector(annoReg))
print(xtable(annoReg.df, align=rep("c", 3)), type="html", include.rownames=FALSE)
| Region | Count |
|---|---|
| upstream | 2046 |
| promoter | 121 |
| overlaps 5' | 9 |
| inside | 15587 |
| overlaps 3' | 9 |
| close to 3' | 54 |
| downstream | 1783 |
| covers | 0 |
| 4422 |
plotOverview(regions=fullRegions[idx.sig], annotation=fullRegions[idx.sig], type="annotation", base_size=30, areaRel=10, legend.position=c(0.97, 0.12))
This genomic overview plot shows the annotation region type for the candidate regions that have a q-value less than 0.1. Note that the regions are shown only if the annotation information is available.
Below are the plots for the top 100 candidate DERs as ranked by area. For each plot, annotation is shown if the candidate DER has a minimum overlap of 20 base pairs with annotation information (strand specific). If present, exons are collapsed and shown in blue. Introns are shown in light blue. The title of each plot is composed of the name of the nearest annotation element, the distance to it, and whether the region of the genome the DER falls into; all three pieces of information are based on bumphunter::annotateNearest().
The annotation depends on the Genomic State used. For details on which one was used for this report check the call to mergeResults in the reproducibility details.
if(nBestRegions > 0) {
plotRegionCoverage(regions=fullRegions, regionCoverage=regionCoverage, groupInfo=optionsStats$groupInfo, nearestAnnotation=regions.df, annotatedRegions=fullAnnotatedRegions, whichRegions = seq_len(min(nBestRegions, length(fullRegions))), colors=NULL, scalefac = 32, ask = FALSE, verbose = TRUE)
}
Below is a table summarizing the number of genomic states per region.
info <- do.call(rbind, lapply(fullAnnotatedRegions$countTable, function(x) { data.frame(table(x)) }))
colnames(info) <- c("Number of Overlapping States", "Frequency")
info$State <- gsub("\\..*", "", rownames(info))
rownames(info) <- NULL
print(xtable(info, align=rep("c", 4)), type="html", include.rownames=FALSE)
| Number of Overlapping States | Frequency | State |
|---|---|---|
| 0 | 22118 | exon |
| 1 | 1803 | exon |
| 2 | 110 | exon |
| 0 | 22892 | intragenic |
| 1 | 1139 | intragenic |
| 0 | 23132 | intron |
| 1 | 861 | intron |
| 2 | 38 | intron |
Below is an interactive table with the top 2000 regions (out of 24031) as ranked by area. This table was generated using rCharts (Vaidyanathan, 2013). Inf and -Inf are shown as 1e100 and -1e100 respectively.
topArea <- head(regions.df, 2000)
topArea <- data.frame("areaRank"=order(topArea$area, decreasing=TRUE), topArea)
## Clean up -Inf, Inf if present
## More details at https://github.com/ramnathv/rCharts/issues/259
replaceInf <- function(df, colsubset=seq_len(ncol(df))) {
for(i in colsubset) {
inf.idx <- !is.finite(df[, i])
if(any(inf.idx)) {
inf.sign <- sign(df[inf.idx, i])
df[inf.idx, i] <- inf.sign * 1e100
}
}
return(df)
}
topArea <- replaceInf(topArea, grep("foldChange", colnames(topArea)))
## Make the table
d <- data.table(topArea)
t1 <- dTable(d, sPaginationType= 'full_numbers', iDisplayLength=10, sScrollX='100%')
t1$print("regions", cdn=TRUE)
The following plots were made using ggbio (Yin et al. 2012) which in turn uses ggplot2 (Wickham, 2009). For more details check plotRegion in derfinder (Collado-Torres et al. 2013).
## Select clusters by cluster area
df <- data.frame(area=fullRegions$area, clusterChr=paste0(as.integer(fullRegions$cluster), chr=as.character(seqnames(fullRegions))))
regionClustAreas <- tapply(df$area, df$clusterChr, sum)
bestArea <- sapply(names(head(sort(regionClustAreas, decreasing=TRUE), nBestClusters)), function(y) { which(df$clusterChr == y)[[1]]})
## Graphical setup: ideograms
if(hg19 & is.null(p.ideos)) {
## Load ideogram info
data(hg19IdeogramCyto, package = "biovizBase")
ideos.set <- as.character(unique(seqnames(fullRegions[bestArea])))
p.ideos <- lapply(ideos.set, function(xx) {
plotIdeogram(hg19IdeogramCyto, xx)
})
names(p.ideos) <- ideos.set
} else {
stopifnot(!is.null(p.ideos))
}
## Error: could not find function "plotIdeogram"
## Graphical setup: transcription database
if(hg19 & is.null(txdb)) {
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
} else {
stopifnot(!is.null(txdb))
}
## Graphical setup: main plotting function
regionClusterPlot <- function(idx, tUse="qval") {
## Chr specific selections
chr <- as.character(seqnames(fullRegions[idx]))
chrnum <- gsub("chr", "", chr)
p.ideo <- p.ideos[[chr]]
covInfo <- fullCov[[chrnum]]
## Make the plot
p <- plotCluster(idx, regions=fullRegions, annotation=regions.df, coverageInfo=covInfo, groupInfo=optionsStats$groupInfo, titleUse=tUse, txdb=txdb, p.ideogram=p.ideo)
print(p)
rm(p.ideo, covInfo)
return(invisible(TRUE))
}
Below are the best 20 region clusters ordered by cluster area (sum of the area of regions inside a cluster). The region with the highest area in the cluster is shown with a red bar.
## Genome plots
for(idx in bestArea) {
regionClusterPlot(idx, ifelse(nullExist, "qval", "none"))
}
Below is the information on how the samples were permutted.
## Get the permutation information
nSamples <- seq_len(length(optionsStats$groupInfo))
permuteInfo <- lapply(seeds, function(x) {
set.seed(x)
idx <- sample(nSamples)
data.frame(optionsStats$groupInfo[idx])
})
permuteInfo <- cbind(data.frame(optionsStats$groupInfo), do.call(cbind, permuteInfo))
colnames(permuteInfo) <- c("original", paste0("perm", 1:10))
## The raw information
# permuteInfo
n <- names(table(permuteInfo[, 2]))
permuteDetail <- data.frame(matrix(NA, nrow=10*length(n), ncol = 2 + length(n)))
permuteDetail[, 1] <- rep(1:10, each=length(n))
permuteDetail[, 2] <- rep(n, 10)
colnames(permuteDetail) <- c("permutation", "group", as.character(n))
l <- 1
m <- 3:ncol(permuteDetail)
for(j in n) {
k <- which(permuteInfo[, 1] == j)
for(i in 2:11) {
permuteDetail[l, m] <- table(permuteInfo[k, i])
l <- l + 1
}
}
## Print the summary
summary(permuteDetail[, m])
## CEU YRI ## Min. : 6.0 Min. :1.0 ## 1st Qu.: 7.5 1st Qu.:3.5 ## Median :10.5 Median :5.0 ## Mean :10.5 Mean :5.0 ## 3rd Qu.:13.5 3rd Qu.:6.5 ## Max. :15.0 Max. :9.0
This table shows the summary per group of and can be used for fast detection of anomalies.
Note that in derfinder (Collado-Torres et al. 2013) the re-sampling of the samples is done without replacement. This is done in an effort to avoid singular model matrices. While the sample balance is the same across the permutations, what changes are the adjusted variables (including the column medians).
The following table shows how the group labels were permuted. This can be useful to detect whether a permutation in particular had too many samples of a group labeled as another group, meaning that the resulting permuted group label resulted in pretty much a name change.
d2 <- data.table(permuteDetail)
t2 <- dTable(d2, sPaginationType= 'full_numbers', iDisplayLength=10, sScrollX='100%')
t2$print("permutation")
This report is part of derfinder (Collado-Torres et al. 2013) and was generated using knitr (Xie, 2013) and styled via knitrBootstrap (Hester, 2013).
Citations made with knitcitations (Boettiger, 2013).
The F-statistic cutoff used was 1e-05 and type of cutoff used was theoretical. Furthermore, the maximum region (data) gap was set to 0 and the maximum cluster gap was set to 3000.
This analysis was on each chromosome was performed with the following call to analyzeChr() (shown for one chromosome only):
## analyzeChr(chrnum = opt$chr, coverageInfo = data, models = models, ## cutoffFstat = 1e-05, cutoffType = "theoretical", nPermute = 10, ## maxClusterGap = 3000, groupInfo = groupInfo, subject = "hg19", ## mc.cores = opt$mcores, verbose = opt$verbose)
The results were merged using the following call to mergeResults():
## mergeResults(prefix = "run1-v0.0.28", genomicState = GenomicState.Hsapiens.UCSC.hg19.knownGene)
This report was generated in path /amber2/scratch/lcollado/derfinderExample/derAnalysis using the following call to generateReport():
## generateReport(prefix = "run1-v0.0.28", browse = FALSE, nBestClusters = 20, ## fullCov = fullCov, device = "CairoPNG")
Date the report was generated.
## [1] "2013-10-01 03:29:16 EDT"
Wallclock time spent generating the report.
## Time difference of 23.57 mins
R session information.
## R version 3.0.1 Patched (2013-05-16 r62754) ## Platform: x86_64-unknown-linux-gnu (64-bit) ## ## locale: ## [1] LC_CTYPE=en_US.iso885915 LC_NUMERIC=C ## [3] LC_TIME=en_US.iso885915 LC_COLLATE=en_US.iso885915 ## [5] LC_MONETARY=en_US.iso885915 LC_MESSAGES=en_US.iso885915 ## [7] LC_PAPER=C LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=en_US.iso885915 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] grid parallel methods stats graphics grDevices utils ## [8] datasets base ## ## other attached packages: ## [1] rCharts_0.3.53 ## [2] knitrBootstrap_0.8.0 ## [3] markdown_0.6.3 ## [4] RColorBrewer_1.0-5 ## [5] xtable_1.7-1 ## [6] knitcitations_0.4-7 ## [7] bibtex_0.3-6 ## [8] knitr_1.4.1 ## [9] data.table_1.8.8 ## [10] gridExtra_0.9.1 ## [11] ggplot2_0.9.3.1 ## [12] TxDb.Hsapiens.UCSC.hg19.knownGene_2.9.0 ## [13] GenomicFeatures_1.12.1 ## [14] AnnotationDbi_1.22.5 ## [15] Biobase_2.20.0 ## [16] biovizBase_1.8.0 ## [17] GenomicRanges_1.12.3 ## [18] IRanges_1.18.1 ## [19] BiocGenerics_0.6.0 ## [20] derfinderReport_0.0.1 ## [21] derfinder_0.0.28 ## [22] RcppArmadillo_0.3.820 ## [23] Rcpp_0.10.4 ## ## loaded via a namespace (and not attached): ## [1] biomaRt_2.16.0 Biostrings_2.28.0 ## [3] bitops_1.0-5 BSgenome_1.28.0 ## [5] Cairo_1.5-2 cluster_1.14.4 ## [7] colorspace_1.2-2 DBI_0.2-7 ## [9] dichromat_2.0-0 digest_0.6.3 ## [11] evaluate_0.4.7 formatR_0.9 ## [13] ggbio_1.8.5 gtable_0.1.2 ## [15] Hmisc_3.10-1.1 httr_0.2 ## [17] labeling_0.1 lattice_0.20-15 ## [19] MASS_7.3-26 munsell_0.4 ## [21] plyr_1.8 proto_0.3-10 ## [23] qvalue_1.34.0 RCurl_1.95-4.1 ## [25] reshape2_1.2.2 RJSONIO_1.0-3 ## [27] Rsamtools_1.12.3 RSQLite_0.11.3 ## [29] rtracklayer_1.20.2 scales_0.2.3 ## [31] stats4_3.0.1 stringr_0.6.2 ## [33] tcltk_3.0.1 tools_3.0.1 ## [35] VariantAnnotation_1.6.5 whisker_0.3-2 ## [37] XML_3.96-1.1 yaml_2.1.7 ## [39] zlibbioc_1.6.0